# !sudo pip install catboost plotly

from IPython.lib.display import YouTubeVideo
YouTubeVideo('Pa-q5elS_nE')
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
import tqdm.notebook
py.init_notebook_mode(connected=True)
# Layout shared by the 3D figures below: zero margin on every side and
# aspectmode='data' for the 3D scene.
EQUAL_ASPECT_RATIO_LAYOUT = {
    'margin': {side: 0 for side in ('l', 'r', 'b', 't')},
    'scene': {'aspectmode': 'data'},
}
def color(x, cmap='Reds'):
    """Map numeric values to colours of a matplotlib colormap.

    The values are min-max normalised before the colormap lookup, so the
    smallest value lands on 0 and the largest on 1.

    Args:
        x: Numeric values (e.g. a NumPy array or a pandas Series).
        cmap: Name of the matplotlib colormap to look up.

    Returns:
        The result of calling the colormap on the normalised values.
    """
    cmap = plt.get_cmap(cmap)
    lowest = np.min(x)
    span = np.max(x) - lowest
    if span == 0:
        # All values are equal: avoid dividing by zero and map every value
        # to the start of the colormap.
        normalized = (x - lowest) * 0.0
    else:
        # Divide by the value range (max - min), not by max alone; dividing
        # by max only reaches 1 when the minimum happens to be 0.
        normalized = (x - lowest) / span
    return cmap(normalized)
%matplotlib inline
from joblib import Parallel, delayed, cpu_count
from sklearn.model_selection import train_test_split
# Load the point table and index it by scene_id, so that ds.loc[<id>]
# selects the rows of a single scene.
ds = pd.read_csv('./snow.csv').set_index(['scene_id'])
ds.head()

# 3D scatter of scene 0: points coloured by `ring` with the 'tab20'
# colormap, with the ring value as hover text.
scene = ds.loc[0]
fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(
    x=scene.x,
    y=scene.y,
    z=scene.z,
    mode='markers',
    marker=dict(size=1, color=color(scene.ring, 'tab20')),
    text=scene.ring,
)
py.iplot(fig)

# 3D scatter of the scene currently bound to `scene`: points coloured by
# `intensity` with the 'seismic' colormap, with the intensity as hover text.
fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(
    x=scene.x,
    y=scene.y,
    z=scene.z,
    mode='markers',
    marker=dict(size=1, color=color(scene.intensity, 'seismic')),
    text=scene.intensity,
)
py.iplot(fig)
# 3D scatter of scene 1: points coloured by `intensity` ('seismic'
# colormap), with the ring value as hover text.
scene = ds.loc[1]
fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(
    x=scene.x,
    y=scene.y,
    z=scene.z,
    mode='markers',
    marker=dict(size=1, color=color(scene.intensity, 'seismic')),
    text=scene.ring,
)
py.iplot(fig)
def filter_by_intensity(intensity, limit=3):
    """Tell which intensities are strictly greater than ``limit``.

    Args:
        intensity: Value(s) to compare; anything that supports ``>``.
        limit: Threshold that has to be exceeded.

    Returns:
        The result of ``intensity > limit`` (element-wise for arrays and
        pandas Series).
    """
    above_limit = intensity > limit
    return above_limit
# Keep only the points of the current scene whose intensity exceeds the
# default limit of filter_by_intensity and show them as a 3D scatter
# coloured by intensity.
filtered_scene = scene[filter_by_intensity(scene.intensity)]
fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(**{
    'x': filtered_scene.x,
    'y': filtered_scene.y,
    'z': filtered_scene.z,
    'mode': 'markers',
    'marker': {
        'size': 1,
        'color': color(filtered_scene.intensity, 'seismic'),
    },
    # The hover text must come from the same filtered rows as x/y/z;
    # taking it from the unfiltered scene attaches ring values to the
    # wrong points.
    'text': filtered_scene.ring
})
py.iplot(fig)
# The result is poor and unclear, so we will train a model instead.
from sklearn.neighbors import KDTree
class ComputeFeatures(object):
    """Compute neighbourhood features for every point of one point cloud.

    Calling an instance builds a KDTree over the point coordinates, looks up
    the neighbours of each point within radius ``r`` and derives count,
    intensity and distance statistics from them.
    """

    def __init__(self, r=1.0):
        """Remember the search radius; per-scene data is set by ``__call__``."""
        self.r = r
        self.index = None
        self.ring = None
        self.intensity = None
        self.xyz = None

    def _feature_names(self):
        """Return a fresh ``{feature name: None}`` dict.

        The key order matters: ``__call__`` uses it for the column names.
        """
        return dict.fromkeys([
            # number of neighbours with the same ring number as the point
            'neighbours_in_ring',
            # number of neighbours with a different ring number
            'neighbours_not_in_ring',
            # minimum / maximum / mean / standard deviation of the
            # neighbours' intensity
            'min_neighbours_intensity',
            'max_neighbours_intensity',
            'mean_neighbours_intensity',
            'std_neighbours_intensity',
            # minimum / maximum / mean / standard deviation of the distance
            # to the neighbours
            'min_distance_to_neighbours',
            'max_distance_to_neighbours',
            'mean_distance_to_neighbours',
            'std_distance_to_neighbours',
        ])

    def compute_point_features(self, point_id, neighbours, distances):
        """Compute the feature dict of a single point.

        Args:
            point_id: Index of the point in the stored arrays.
            neighbours: Indices of the point's neighbours.
            distances: Distances from the point to those neighbours.

        Returns:
            Dict with the keys of ``_feature_names`` filled in.
        """
        features = self._feature_names()

        neighbour_rings = self.ring[neighbours]
        same_ring_mask = neighbour_rings == self.ring[point_id]
        same_ring_count = len(np.where(same_ring_mask)[0])
        features['neighbours_in_ring'] = same_ring_count
        features['neighbours_not_in_ring'] = len(neighbour_rings) - same_ring_count

        neighbour_intensity = self.intensity[neighbours]
        reducers = {'min': np.min, 'max': np.max, 'mean': np.mean, 'std': np.std}
        for stat, reduce_fn in reducers.items():
            features[f'{stat}_neighbours_intensity'] = reduce_fn(neighbour_intensity)
            features[f'{stat}_distance_to_neighbours'] = reduce_fn(distances)
        return features

    def get_point_neighbours(self, point_id):
        """Query the KDTree for everything within ``self.r`` of one point.

        Returns the result of ``query_radius(..., return_distance=True)``,
        which ``__call__`` unpacks as ``(neighbours, distances)``.
        """
        query_point = self.xyz[point_id][np.newaxis, :]
        return self.index.query_radius(query_point, r=self.r, return_distance=True)

    def __call__(self, xyz, intensity, ring, label, ds_cols):
        """Compute the features of all points and return them as a DataFrame.

        Args:
            xyz: Point coordinates, one row per point.
            intensity: Per-point intensity values.
            ring: Per-point ring numbers.
            label: Per-point labels.
            ds_cols: Names for the raw columns. They are appended in the
                order xyz, intensity, ring, label, so the names have to be
                given in that order.

        Returns:
            DataFrame whose columns are the feature names followed by
            ``ds_cols``.
        """
        self.xyz = xyz[:]
        self.intensity = intensity[:]
        self.ring = ring[:]
        self.index = KDTree(self.xyz)

        per_point = []
        for point_id in range(len(self.xyz)):
            found = self.get_point_neighbours(point_id)
            neighbours, distances = found
            # One query point was passed, so take the single result row.
            per_point.append(
                self.compute_point_features(point_id, neighbours[0], distances[0]))

        column_names = [*self._feature_names(), *ds_cols]
        raw_frames = [
            pd.DataFrame(data=block)
            for block in (self.xyz, intensity, ring, label)
        ]
        ds_data = pd.concat(raw_frames, axis=1)
        features_data = pd.DataFrame(data=per_point)
        combined = pd.concat([features_data, ds_data], axis=1)
        return pd.DataFrame(data=combined.values, columns=column_names)
# ds_features = pd.read_csv('./snow_features.csv')
# ds_features = ds_features.drop(["Unnamed: 0"], axis=1)
# ds_features.shape
# Neighbourhood radius handed to ComputeFeatures for every scene.
R = 1.0


def process_scene(scene_id, scene, r):
    """Compute the features of one scene and save them as a CSV file.

    The result is written to ``./features/<scene_id>.csv``; the directory
    is created when it does not exist yet.

    Args:
        scene_id: Identifier of the scene, used for the output file name.
        scene: DataFrame with the columns x, y, z, intensity, ring and label.
        r: Neighbourhood radius passed to ComputeFeatures.
    """
    import os

    # to_csv does not create missing directories. exist_ok=True keeps the
    # call safe when several workers reach it at the same time.
    os.makedirs('./features', exist_ok=True)

    features = ComputeFeatures(r=r)
    features_df = features(
        scene[['x', 'y', 'z']].values, scene.intensity.values, scene.ring.values,
        scene.label.values, scene.columns)
    features_df.to_csv('./features/{}.csv'.format(scene_id))
# Run process_scene once per distinct scene_id on a joblib pool with
# cpu_count() workers; tqdm wraps the id sequence to show progress.
with Parallel(n_jobs=cpu_count()) as pool:
    scene_jobs = (
        delayed(process_scene)(scene_id, scene=ds.loc[scene_id], r=R)
        for scene_id in tqdm.tqdm(ds.reset_index().scene_id.unique())
    )
    pool(scene_jobs)
# 3D scatter of scene 1: points coloured by `label` ('seismic' colormap),
# with the label as hover text.
scene = ds.loc[1]
fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(
    x=scene.x,
    y=scene.y,
    z=scene.z,
    mode='markers',
    marker=dict(size=1, color=color(scene.label, 'seismic')),
    text=scene.label,
)
py.iplot(fig)
def get_features(ids):
    """Load the saved feature files of several scenes into one DataFrame.

    Each ``features/<scene_id>.csv`` is read, tagged with a ``scene_id``
    column and the frames are concatenated in the order of ``ids``. The
    per-file row index is kept as is.

    Args:
        ids: Iterable of scene ids.

    Returns:
        The concatenated DataFrame without the ``Unnamed: 0`` column.
    """
    def load_scene(scene_id):
        # Read one scene and remember which scene its rows belong to.
        frame = pd.read_csv(f'features/{scene_id}.csv')
        frame['scene_id'] = [scene_id] * len(frame)
        return frame

    stacked = pd.concat([load_scene(scene_id) for scene_id in ids], sort=False)
    return stacked.drop(columns=['Unnamed: 0'])
# Split by scene id, so all points of a scene end up in the same subset:
# 20% of the scenes go to test, then 30% of the remaining ones to
# validation. No random_state is passed to train_test_split.
ids = ds.reset_index().scene_id.unique()
train_ids, test_ids = train_test_split(ids, test_size=0.2)
train_ids, val_ids = train_test_split(train_ids, test_size=0.3)
train, test, val = (
    get_features(subset_ids) for subset_ids in (train_ids, test_ids, val_ids)
)
print(train.columns)
train
import catboost
def learn(X_train, X_val, y_train, y_val, *, n_estimators=100,
          early_stopping_rounds=10, plot=True):
    """Fit a CatBoost classifier with early stopping on a validation set.

    Args:
        X_train: Training features.
        X_val: Validation features (DataFrame; passed to CatBoost as values).
        y_train: Training labels.
        y_val: Validation labels (Series; passed to CatBoost as values).
        n_estimators: Number of estimators given to CatBoostClassifier.
        early_stopping_rounds: Passed through to ``fit``.
        plot: Passed through to ``fit``.

    Returns:
        The fitted classifier.
    """
    clf = catboost.CatBoostClassifier(n_estimators=n_estimators)
    clf.fit(
        X_train, y_train, early_stopping_rounds=early_stopping_rounds,
        use_best_model=True, eval_set=(X_val.values, y_val.values),
        plot=plot, verbose=False)
    return clf
# Model inputs: every column except the scene id, the target and the raw
# coordinates; the target is the `label` column.
X_train = train.drop(columns=["scene_id", "label", "x", "y", "z"])
y_train = train["label"]
X_val = val.drop(columns=["scene_id", "label", "x", "y", "z"])
y_val = val["label"]
# Drop the reference to the raw table before training.
del ds
cls = learn(X_train, X_val, y_train, y_val)
X_test = test.drop(columns=['scene_id', 'x', 'y', 'z', 'label'])
y_test = test["label"]
from sklearn.metrics import precision_recall_curve, precision_score, recall_score
def test_one(clf, X_test, y_test, num_points=2000):
    """Sample points of the precision-recall curve of a classifier.

    Args:
        clf: Fitted classifier with ``predict_proba``.
        X_test: Test features.
        y_test: True labels.
        num_points: Number of curve points to sample (evenly spaced indices).

    Returns:
        Tuple ``(precision, recall, thresholds)`` of equally long arrays in
        which position i of each array refers to the same curve point.
    """
    y_test_hat = clf.predict_proba(X_test)
    pr, rec, thr = precision_recall_curve(y_test, y_test_hat[:, 1])
    # precision[i] and recall[i] belong to thresholds[i]; only the final
    # extra precision/recall entry has no threshold. Sample indices over the
    # range covered by the thresholds so all three arrays stay aligned.
    ix = np.linspace(0, len(thr) - 1, num=num_points).astype(int)
    return pr[ix], rec[ix], thr[ix]
def heuristic_filter_scoring():
    """Score the intensity-threshold baseline on the test set.

    For every limit from 1 to 9 all test rows are predicted as 1, except
    the rows whose intensity exceeds the limit, which are predicted as 0.
    Reads the module-level ``X_test``, ``test`` and ``y_test``.

    Returns:
        Tuple ``(precisions, recalls, limits_text)``: two lists with one
        score per limit, and the limits joined into one space-separated
        string.
    """
    # NOTE(review): the limits are returned as a single string; a plotly
    # trace given this as `text` would show the same text on every point.
    # Confirm whether a per-point list was intended.
    limits = range(1, 10)
    precisions, recalls = [], []
    for limit in limits:
        predicted = np.ones(len(X_test))
        predicted[filter_by_intensity(test.intensity, limit)] = 0
        precisions.append(precision_score(y_test, predicted))
        recalls.append(recall_score(y_test, predicted))
    limits_text = ' '.join(str(limit) for limit in limits)
    return precisions, recalls, limits_text


pr_bl, rec_bl, thr_bl = heuristic_filter_scoring()
def plot_pr_rec(*models):
    """Plot the precision-recall curves of models next to the baseline.

    Args:
        *models: Tuples ``(name, classifier, X_test, y_test)``; one curve
            is drawn per tuple using ``test_one``.

    The baseline trace is built from the module-level ``rec_bl``, ``pr_bl``
    and ``thr_bl``.
    """
    def model_trace(name, clf, features, target):
        # One line trace per model, with the thresholds as hover text.
        precision, recall, thresholds = test_one(clf, features, target)
        return go.Scattergl(x=recall, y=precision, mode='lines', text=thresholds, name=name)

    curves = [
        model_trace(model, clf, X_test, y_test)
        for model, clf, X_test, y_test in models
    ]
    baseline = go.Scatter(
        x=rec_bl, y=pr_bl, mode='lines+markers', text=thr_bl, name='Intensity BL')
    layout = go.Layout(
        title='Precission-recall',
        xaxis={'title': 'Recall'},
        yaxis={'title': 'Precission'},
    )
    py.iplot(go.Figure(data=curves + [baseline], layout=layout))


models = [('Catboost classifier', cls, X_test, y_test)]
plot_pr_rec(*models)
# Show one test scene in 3D, coloured by whether the classifier's rounded
# prediction matches the true label.
y_test_hat = cls.predict_proba(test.drop(['scene_id', 'x', 'y', 'z', 'label'], axis=1))
itest = test.set_index(['scene_id'])
scene_id = 285
if scene_id not in itest.index:
    # The train/test split is random, so the preferred scene is not
    # guaranteed to be part of the test set; fall back to the first one.
    scene_id = itest.index[0]
scene = itest.loc[scene_id]
scene_predictions = y_test_hat[(test.scene_id == scene_id).values][:, 1]
fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
# Keep the predicted class and the "prediction is correct" flag apart: the
# flag drives the colour, the predicted class goes into the hover text.
predicted_labels = np.round(scene_predictions).astype('int')
preds = (predicted_labels == scene.label).astype('int')
colors = []  # not used in the code visible here
fig.add_scatter3d(**{
    'x': scene.x,
    'y': scene.y,
    'z': scene.z,
    'mode': 'markers',
    'marker': {
        'size': 3,
        'opacity': 1,
        'color': color(preds, 'bwr'),
    },
    'text': [
        f'real: {target}, predicted: {pred}'
        for target, pred in zip(scene.label, predicted_labels)
    ]
})
py.iplot(fig)